library(tidyverse)
## ── Attaching packages ───────────────────────────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 3.2.1     ✔ purrr   0.3.2
## ✔ tibble  2.1.3     ✔ dplyr   0.8.3
## ✔ tidyr   1.0.0     ✔ stringr 1.4.0
## ✔ readr   1.3.1     ✔ forcats 0.4.0
## ── Conflicts ──────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
library(rvest)
## Loading required package: xml2
## 
## Attaching package: 'rvest'
## The following object is masked from 'package:purrr':
## 
##     pluck
## The following object is masked from 'package:readr':
## 
##     guess_encoding
library(readr)
library(viridis)
## Loading required package: viridisLite
library(leaflet)
knitr::opts_chunk$set(
    echo = TRUE,
    warning = FALSE,
    fig.width = 8, 
  fig.height = 6,
  out.width = "90%"
)
options(
  ggplot2.continuous.colour = "viridis",
  ggplot2.continuous.fill = "viridis"
)
scale_colour_discrete = scale_colour_viridis_d
scale_fill_discrete = scale_fill_viridis_d
theme_set(theme_minimal() + theme(legend.position = "bottom"))
data_2018 = 
  read_csv("./data/2018data.csv") %>% 
  janitor::clean_names() 
## Parsed with column specification:
## cols(
##   .default = col_character(),
##   TIME = col_time(format = ""),
##   `ZIP CODE` = col_double(),
##   LATITUDE = col_double(),
##   LONGITUDE = col_double(),
##   `NUMBER OF PERSONS INJURED` = col_double(),
##   `NUMBER OF PERSONS KILLED` = col_double(),
##   `NUMBER OF PEDESTRIANS INJURED` = col_double(),
##   `NUMBER OF PEDESTRIANS KILLED` = col_double(),
##   `NUMBER OF CYCLIST INJURED` = col_double(),
##   `NUMBER OF CYCLIST KILLED` = col_double(),
##   `NUMBER OF MOTORIST INJURED` = col_double(),
##   `NUMBER OF MOTORIST KILLED` = col_double(),
##   COLLISION_ID = col_double()
## )
## See spec(...) for full column specifications.
newnames = colnames(data_2018) %>% 
  str_replace("number_of_","") 
names(data_2018) = newnames

tidy_data = 
data_2018 %>% 
  separate(date, into = c("month", "day", "year"), sep = "/") %>% 
  separate(time, into = c("hour", "minute"), sep = ":") %>%
  select(-zip_code, -location, -on_street_name, -cross_street_name, -off_street_name,-collision_id,-year) %>% 
  rename("vehicle_type" = "vehicle_type_code_1") %>% 
  mutate( day = as.numeric(day),
          month = as.numeric(month),
          hour = as.numeric(hour),
          minute = as.numeric(minute),
          latitude = replace_na(latitude,0),
          vehicle_type = str_to_lower(vehicle_type)
  ) %>%
  filter( latitude != 0)

Vehicle type

vehicle_type_data = 
  tidy_data %>% 
  mutate(
    vehicle_type = replace(vehicle_type,str_detect(vehicle_type,"truck"),"truck"),
    vehicle_type = replace(vehicle_type,str_detect(vehicle_type,"sport utility"),"sport utility vehicle")
    ) %>% 
  filter( vehicle_type %in% c("taxi","passenger vehicle","truck","sport utility vehicle")) %>% 
  group_by(vehicle_type,hour) %>% 
  summarize(
    n = n()
  )

vehicle_type_data %>% 
  plot_ly(
    x = ~hour, y = ~n, color = ~vehicle_type, type = "scatter", mode = "line") %>% 
  layout(
    title = "Collisions of Day for Different Vehicles",
    xaxis = list(title = "Hour of Day"),
    yaxis = list(title = "Collisions")
    )

Top 8 Collision Reasons

reason_data = 
  tidy_data %>%
  group_by(contributing_factor_vehicle_1) %>%
  summarize(n = n()) %>% 
  arrange(desc(n)) %>% 
  head(10)

reason_data %>% 
  plot_ly(x = ~reorder(contributing_factor_vehicle_1,desc(n)), y = ~n, color = ~contributing_factor_vehicle_1 ,type = "bar") %>% 
  layout(
    title = "The Number of Items Ordered in Each Aisle",
    xaxis = list(title = "Different Reasons"),
    yaxis = list(title = "Count")
    )

Mapping

data_2018 = tidy_data

data_2018 = rename(data_2018, long = latitude, lat = longitude)

pal <- colorNumeric(
  palette = "viridis",
  domain = data_2018$persons_injured)

data_2018 %>% 
  filter(!(lat < "-70" | lat >= "-75")) %>% 
  filter(persons_injured > 2) %>% 
    mutate(
    label = str_c("<b>vehicle type: ", vehicle_type, "</b><br>Month: ", month , sep = "") ) %>% 
  sample_n(2000) %>% 
  leaflet() %>% 
  addTiles() %>%
  addProviderTiles(providers$CartoDB.Positron) %>% 
  addLegend("bottomright", pal = pal, values = ~persons_injured,
    title = "Persons Injured",
    opacity = 1
  ) %>% 
  addCircleMarkers(
    ~lat, ~long,
    color = ~pal(persons_injured),
    radius = 0.5,
    popup = ~ label) 
data_2018 %>% 
  group_by(borough) %>% 
  summarise(n())
## # A tibble: 6 x 2
##   borough       `n()`
##   <chr>         <int>
## 1 BRONX         22121
## 2 BROOKLYN      46314
## 3 MANHATTAN     29728
## 4 QUEENS        40400
## 5 STATEN ISLAND  5988
## 6 <NA>          71583